import requests

from lxml import etree

# Scrape one list page of the forum and print the URL of every thread on it,
# followed by the number of threads found.
url = "http://8btc.com/forum-61-2.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}

# requests never times out unless told to; bound the wait so the script cannot
# hang forever, and fail loudly on an HTTP error instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

# The site's charset is gb2312, so the body is decoded as gbk.
# errors='replace' keeps a single malformed byte from aborting the whole run.
data = response.content.decode('gbk', errors='replace')

print(data)
# Parse with XPath
parse_data = etree.HTML(data)


# Extract the title and URL of every thread on the list page.

# Start from the enclosing form node.
forms = parse_data.xpath('//form[@id="moderate"]')
if not forms:
    raise IndexError('form with id="moderate" not found; the page layout may have changed')
form = forms[0]

# Drop the child divs that are not thread rows. The original code did
# pop(0) followed by pop(1), which removes the divs at ORIGINAL positions 0
# (header row) and 2 (advertisement); the same positions are skipped here,
# without raising when the list is shorter than expected.
# NOTE(review): confirm the advertisement really sits at position 2.
div_list = [
    div for position, div in enumerate(form.xpath('div'))
    if position not in (0, 2)
]

# Match the thread anchor by class *token* so that both class="xst" and
# class="s xst" are accepted; the previous exact-match selectors disagreed
# with each other ("xst" for the title, "s xst" for the href).
thread_link_xpath = (
    './div[2]/div/a[contains(concat(" ", normalize-space(@class), " "), " xst ")]'
)

threads = []
for div in div_list:
    links = div.xpath(thread_link_xpath)
    if not links:
        # Not a thread row (e.g. separator or ad): skip instead of crashing.
        continue
    link = links[0]
    child_url = link.get('href')
    if child_url is None:
        continue
    child_a = link.text or ''
    threads.append((child_a, child_url))
    print(child_url)

# Number of threads found on the page.
index = len(threads)
print(index)

# Parse the forum thread title
# title = parse_data.xpath('//*[@id="normalthread_218441"]/div[2]/div/a[1]/text()')[0]
# title_url = parse_data.xpath('//*[@id="normalthread_218429"]/div[2]/div/a[1]/@href')[0]
#
# print(title + title_url)


#
# with open('02btx.html', 'w') as f:
#     f.write(data)